1 Summary

2 Loading

2.1 packages

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggrepel)
library(funFEM)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: fda
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
## 
##     matplot
## Loading required package: elasticnet
## Loading required package: lars
## Loaded lars 1.2
theme_set(theme_classic(18) +
            theme(legend.position = "bottom"))

2.2 data

# total_cases = readr::read_csv("total_cases.csv")
# 
# cases_long = total_cases %>% 
#   dplyr::select(-World) %>% 
#   tidyr::pivot_longer(cols =  -date, 
#                       names_to = "regions", 
#                       values_to = "cum_cases")
rawdata = readr::read_csv("owid-covid-data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso_code = col_character(),
##   continent = col_character(),
##   location = col_character(),
##   date = col_date(format = ""),
##   new_tests = col_logical(),
##   total_tests = col_logical(),
##   total_tests_per_thousand = col_logical(),
##   new_tests_per_thousand = col_logical(),
##   new_tests_smoothed = col_logical(),
##   new_tests_smoothed_per_thousand = col_logical(),
##   tests_per_case = col_logical(),
##   positive_rate = col_logical(),
##   tests_units = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 120294 parsing failures.
##  row                      col           expected          actual                  file
## 1074 new_tests                1/0/T/F/TRUE/FALSE 75.0            'owid-covid-data.csv'
## 1074 total_tests              1/0/T/F/TRUE/FALSE 75.0            'owid-covid-data.csv'
## 1074 total_tests_per_thousand 1/0/T/F/TRUE/FALSE 0.008           'owid-covid-data.csv'
## 1074 new_tests_per_thousand   1/0/T/F/TRUE/FALSE 0.008           'owid-covid-data.csv'
## 1074 tests_units              1/0/T/F/TRUE/FALSE tests performed 'owid-covid-data.csv'
## .... ........................ .................. ............... .....................
## See problems(...) for more details.
subdata = rawdata %>% 
  dplyr::select(iso_code, location, date, new_cases, new_cases_per_million, new_cases_smoothed_per_million, total_cases) %>% 
  dplyr::filter(complete.cases(new_cases), 
                complete.cases(new_cases_smoothed_per_million),
                location != "World")

cases30_data = subdata %>% 
  group_by(location) %>% 
  dplyr::arrange(date) %>% 
  dplyr::summarise(firstdate_cases30 = date[(new_cases >= 30)] %>% first,
                   cases30 = new_cases[(new_cases >= 30)] %>% first) %>% 
  dplyr::filter(complete.cases(firstdate_cases30))
## `summarise()` ungrouping output (override with `.groups` argument)

3 Visualisation

mergedata = subdata %>% 
  left_join(cases30_data, by = "location") %>% 
  dplyr::mutate(
    days_since_cases30 = difftime(time1 = date, time2 = firstdate_cases30, units = "days"),
    days_since_cases30_int = days_since_cases30 %>% as.integer) %>% 
  dplyr::filter(as.integer(days_since_cases30) >= 0, 
                total_cases >= 50000)


mergedata %>% 
  ggplot(aes(x = days_since_cases30_int, 
             y = new_cases_per_million, 
             group = location)) +
  # geom_point(alpha = 0.5) +
  geom_line() +
  scale_y_continuous(trans = "log1p", labels = scales::comma, 
                     breaks = c(1, 10, 1e2, 1e3, 1e4, 1e5, 1e6))
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis

mergedata %>% 
  ggplot(aes(x = days_since_cases30_int, 
             y = new_cases_smoothed_per_million, 
             group = location)) +
  # geom_point(alpha = 0.5) +
  geom_line() +
  scale_y_continuous(trans = "log1p", labels = scales::comma)

4 Functional clustering example

# library(funHDDC)
# data("trigo")
# dim(trigo)
# 
# basis <- create.bspline.basis(c(0,1), nbasis=25)
# var1 <- smooth.basis(argvals=seq(0,1,length.out = 100),y=t(trigo[,1:100]),fdParobj=basis)$fd
# 
# res.uni <- funHDDC(var1,K=2,model="AkBkQkDk",init="kmeans",threshold=0.2)
# 
# plot(var1)
# 
# # matplot(trigo, type = "l")
head(day.5)
CanadianWeather_Temp <- CanadianWeather$dailyAv[,,"Temperature.C"]
head(CanadianWeather_Temp)
dim(CanadianWeather_Temp)
matplot(CanadianWeather_Temp, type = "l")

basis <- create.bspline.basis(c(0, 365), nbasis=21, norder=4) # norder=4 : cubic spline
fdobj <- smooth.basis(day.5, CanadianWeather_Temp, 
                      basis, fdnames=list("Day", "Station", "Deg C"))$fd
res <- funFEM(fdobj, K=4)
# plot(fdobj, col=res$cls, lwd=2, lty=1)
matplot(fdobj$coefs, type = "l", col = res$cls)

5 Applying functional clustering to COVID

mergedata_wide = mergedata %>% 
  pivot_wider(id_cols = days_since_cases30_int,
              names_from = location, 
              values_from = new_cases_smoothed_per_million) %>% 
  dplyr::arrange(days_since_cases30_int)

# mergedata_wide %>% 
#   summarise_all(.funs = list(~mean(is.na(.))))

mergedata_wide_zeroes_mat = mergedata_wide %>% 
  dplyr::mutate_all(.funs = coalesce, 0) %>% 
  as.data.frame %>% tibble::column_to_rownames("days_since_cases30_int") %>% as.matrix()

mergedata_wide_zeroes_mat %>% dim
## [1] 199  50
basis <- create.bspline.basis(c(0, nrow(mergedata_wide_zeroes_mat)), nbasis = 80, norder = 4) # norder=4 : cubic spline
fdobj <- smooth.basis(argvals = seq_len(nrow(mergedata_wide_zeroes_mat)), 
                      y = mergedata_wide_zeroes_mat, 
                      basis)$fd
res <- funFEM(fdobj, K = 20, model = "AB")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# matplot(fdobj$coefs, type = "l", col = res$cls)
plotdf = mergedata %>% 
  left_join(tibble(
    location = colnames(mergedata_wide_zeroes_mat), 
    cluster = res$cls), by = "location") %>% 
  group_by(cluster, iso_code) %>% 
  dplyr::mutate(
    label = ifelse(days_since_cases30_int == max(days_since_cases30_int), iso_code, NA))



plotdf %>% 
  ggplot(aes(x = days_since_cases30_int, 
             y = new_cases_smoothed_per_million, 
             group = iso_code)) +
  geom_line() +
  geom_text_repel(aes(label = label), colour = "#3079ff", fontface = 2) +
  facet_wrap(~cluster, scales = "free_y", labeller = label_both) +
  labs(x = "Days since first reaching 30 confirmed daily cases", 
       y = "New cases per million (smoothed)", 
       title = "Countries with similar COVID-19 new cases trajectories", 
       subtitle = "(I don't think this clustering is great...)",
       caption = "https://ourworldindata.org/coronavirus")
## Warning: Removed 3837 rows containing missing values (geom_text_repel).

6 Session Info

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.2 (2019-12-12)
##  os       macOS Mojave 10.14.6        
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_AU.UTF-8                 
##  ctype    en_AU.UTF-8                 
##  tz       Australia/Melbourne         
##  date     2020-08-24                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version  date       lib source                            
##  assertthat    0.2.1    2019-03-21 [1] CRAN (R 3.6.0)                    
##  backports     1.1.8    2020-06-17 [1] CRAN (R 3.6.2)                    
##  blob          1.2.1    2020-01-20 [1] CRAN (R 3.6.0)                    
##  broom         0.7.0    2020-07-09 [1] CRAN (R 3.6.2)                    
##  cellranger    1.1.0    2016-07-27 [1] CRAN (R 3.6.0)                    
##  cli           2.0.2    2020-02-28 [1] CRAN (R 3.6.0)                    
##  colorspace    1.4-1    2019-03-18 [1] CRAN (R 3.6.0)                    
##  crayon        1.3.4    2017-09-16 [1] CRAN (R 3.6.0)                    
##  DBI           1.1.0    2019-12-15 [1] CRAN (R 3.6.0)                    
##  dbplyr        1.4.4    2020-05-27 [1] CRAN (R 3.6.2)                    
##  digest        0.6.25   2020-02-23 [1] CRAN (R 3.6.0)                    
##  dplyr       * 1.0.0    2020-05-29 [1] CRAN (R 3.6.2)                    
##  elasticnet  * 1.3      2020-05-15 [1] CRAN (R 3.6.2)                    
##  ellipsis      0.3.1    2020-05-15 [1] CRAN (R 3.6.2)                    
##  evaluate      0.14     2019-05-28 [1] CRAN (R 3.6.0)                    
##  fansi         0.4.1    2020-01-08 [1] CRAN (R 3.6.0)                    
##  farver        2.0.3    2020-01-16 [1] CRAN (R 3.6.0)                    
##  fda         * 5.1.4    2020-04-20 [1] CRAN (R 3.6.2)                    
##  forcats     * 0.5.0    2020-03-01 [1] CRAN (R 3.6.2)                    
##  fs            1.4.2    2020-06-30 [1] CRAN (R 3.6.2)                    
##  funFEM      * 1.1      2015-03-13 [1] CRAN (R 3.6.0)                    
##  generics      0.0.2    2018-11-29 [1] CRAN (R 3.6.0)                    
##  ggplot2     * 3.3.2    2020-06-19 [1] CRAN (R 3.6.2)                    
##  ggrepel     * 0.8.2    2020-03-08 [1] CRAN (R 3.6.2)                    
##  glue          1.4.1    2020-05-13 [1] CRAN (R 3.6.2)                    
##  gtable        0.3.0    2019-03-25 [1] CRAN (R 3.6.0)                    
##  haven         2.3.1    2020-06-01 [1] CRAN (R 3.6.2)                    
##  hms           0.5.3    2020-01-08 [1] CRAN (R 3.6.0)                    
##  htmltools     0.5.0    2020-06-16 [1] CRAN (R 3.6.2)                    
##  httr          1.4.1    2019-08-05 [1] CRAN (R 3.6.0)                    
##  jsonlite      1.7.0    2020-06-25 [1] CRAN (R 3.6.2)                    
##  knitr         1.29     2020-06-23 [1] CRAN (R 3.6.2)                    
##  labeling      0.3      2014-08-23 [1] CRAN (R 3.6.0)                    
##  lars        * 1.2      2013-04-24 [1] CRAN (R 3.6.0)                    
##  lattice       0.20-41  2020-04-02 [1] CRAN (R 3.6.2)                    
##  lifecycle     0.2.0    2020-03-06 [1] CRAN (R 3.6.0)                    
##  lubridate   * 1.7.9    2020-06-08 [1] CRAN (R 3.6.2)                    
##  magrittr      1.5      2014-11-22 [1] CRAN (R 3.6.0)                    
##  MASS        * 7.3-51.6 2020-04-26 [1] CRAN (R 3.6.2)                    
##  Matrix      * 1.2-18   2019-11-27 [1] CRAN (R 3.6.2)                    
##  modelr        0.1.8    2020-05-19 [1] CRAN (R 3.6.2)                    
##  munsell       0.5.0    2018-06-12 [1] CRAN (R 3.6.0)                    
##  pillar        1.4.6    2020-07-10 [1] CRAN (R 3.6.2)                    
##  pkgconfig     2.0.3    2019-09-22 [1] CRAN (R 3.6.0)                    
##  purrr       * 0.3.4    2020-04-17 [1] CRAN (R 3.6.2)                    
##  R6            2.4.1    2019-11-12 [1] CRAN (R 3.6.0)                    
##  Rcpp          1.0.5    2020-07-06 [1] CRAN (R 3.6.2)                    
##  readr       * 1.3.1    2018-12-21 [1] CRAN (R 3.6.0)                    
##  readxl        1.3.1    2019-03-13 [1] CRAN (R 3.6.0)                    
##  reprex        0.3.0    2019-05-16 [1] CRAN (R 3.6.0)                    
##  rlang         0.4.7    2020-07-09 [1] CRAN (R 3.6.2)                    
##  rmarkdown     2.3.1    2020-06-23 [1] Github (rstudio/rmarkdown@b53a85a)
##  rstudioapi    0.11     2020-02-07 [1] CRAN (R 3.6.0)                    
##  rvest         0.3.5    2019-11-08 [1] CRAN (R 3.6.0)                    
##  scales        1.1.1    2020-05-11 [1] CRAN (R 3.6.2)                    
##  sessioninfo   1.1.1    2018-11-05 [1] CRAN (R 3.6.0)                    
##  stringi       1.4.6    2020-02-17 [1] CRAN (R 3.6.1)                    
##  stringr     * 1.4.0    2019-02-10 [1] CRAN (R 3.6.0)                    
##  tibble      * 3.0.3    2020-07-10 [1] CRAN (R 3.6.2)                    
##  tidyr       * 1.1.0    2020-05-20 [1] CRAN (R 3.6.2)                    
##  tidyselect    1.1.0    2020-05-11 [1] CRAN (R 3.6.2)                    
##  tidyverse   * 1.3.0    2019-11-21 [1] CRAN (R 3.6.0)                    
##  vctrs         0.3.1    2020-06-05 [1] CRAN (R 3.6.2)                    
##  withr         2.2.0    2020-04-20 [1] CRAN (R 3.6.2)                    
##  xfun          0.15     2020-06-21 [1] CRAN (R 3.6.2)                    
##  xml2          1.3.2    2020-04-23 [1] CRAN (R 3.6.2)                    
##  yaml          2.2.1    2020-02-01 [1] CRAN (R 3.6.1)                    
## 
## [1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library